{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cb37aecd-4bba-4369-a8c6-ecfe3b2ee344",
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext dotenv\n",
    "%dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3ce78974-f3cc-4309-8a89-8f5040f57476",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.graphs import Neo4jGraph\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.prompts import ChatPromptTemplate\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f316081-d0be-4ef1-b310-306fe7153765",
   "metadata": {},
   "source": [
    "# Basic model\n",
    "\n",
    "* enum for node labels and rel types\n",
    "* no node or rel properties\n",
    "* most openai models are terrible at following enum, so we might introduce optional postfiltering\n",
    "* not limiting rel types between specific node labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "be3e49a8-bf1e-45cd-a6cd-dcde38600eac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"NEO4J_URI\"] = 'neo4j+s://demo.neo4jlabs.com'\n",
    "os.environ[\"NEO4J_USERNAME\"] = 'recommendations'\n",
    "os.environ[\"NEO4J_PASSWORD\"] = 'recommendations'\n",
    "os.environ[\"NEO4J_DATABASE\"] = 'recommendations'\n",
    "os.environ[\"OPENAI_API_KEY\"] = 'sk-'\n",
    "\n",
    "graph = Neo4jGraph()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "26501c9e-f698-4ef7-8380-2048cb5ed8d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm=ChatOpenAI(temperature=0, model=\"gpt-4-0125-preview\")\n",
    "\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "        [(\n",
    "          \"system\",\n",
    "          \"\"\"# Knowledge Graph Instructions for GPT-4\n",
    "## 1. Overview\n",
    "You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.\n",
    "- **Nodes** represent entities and concepts.\n",
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.\n",
    "## 2. Labeling Nodes\n",
    "- **Consistency**: Ensure you use available types for node labels.\n",
    "  - If there are provided available values for node labels, use only those and nothing else. If the entity doesn't fit any of the included labels, do not return the entity!\n",
    "  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.\n",
    "## 3. Handling Numerical Data and Dates\n",
    "- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.\n",
    "- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.\n",
    "- **Property Format**: Properties must be in a key-value format.\n",
    "- **Quotation Marks**: Never use escaped single or double quotes within property values.\n",
    "- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.\n",
    "## 4. Coreference Resolution\n",
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.\n",
    "If an entity, such as \"John Doe\", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., \"Joe\", \"he\"),\n",
    "always use the most complete identifier for that entity throughout the knowledge graph. In this example, use \"John Doe\" as the entity ID.\n",
    "Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.\n",
    "## 5. Strict Compliance\n",
    "Adhere to the rules strictly. Non-compliance will result in termination.\n",
    "          \"\"\"),\n",
    "            (\"human\", \"Use the given format to extract information from the following input: {input}\"),\n",
    "            (\"human\", \"Tip: Make sure to answer in the correct format and do not include any \"),\n",
    "        ])\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7d34c836-410d-4e8f-baaa-762a0d99e4ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Union, Optional\n",
    "\n",
    "from langchain_core.documents import Document\n",
    "from langchain_core.pydantic_v1 import BaseModel, Field\n",
    "\n",
    "def optional_enum_field(enum_values: Optional[List[str]] = None, **field_kwargs):\n",
    "    \"\"\"Utility function to conditionally create a field with an enum constraint.\"\"\"\n",
    "    if enum_values:\n",
    "        return Field(..., enum=enum_values, **field_kwargs)\n",
    "    else:\n",
    "        return Field(..., **field_kwargs)\n",
    "\n",
    "def create_simple_model(node_labels: Optional[List[str]] = None, rel_types:Optional[List[str]] = None) -> BaseModel:\n",
    "    \"\"\"\n",
    "    Simple model allows to limit node and/or relationship types.\n",
    "    Doesn't have any node or relationship properties.\n",
    "    \"\"\"\n",
    "    class Node(BaseModel):\n",
    "        \"\"\"Represents a node in a graph with associated properties.\n",
    "        \"\"\"\n",
    "        id: str = Field(description=\"A unique identifier for the node.\")\n",
    "        type: str = optional_enum_field(node_labels, description=\"The type or label of the node.\")\n",
    "    \n",
    "    class Relationship(BaseModel):\n",
    "        \"\"\"Represents a directed relationship between two nodes in a graph.\n",
    "        \"\"\"\n",
    "    \n",
    "        source: Node = Field(description=\"The source node of the relationship.\")\n",
    "        target: Node = Field(description=\"The target node of the relationship.\")\n",
    "        type: str = optional_enum_field(rel_types, description=\"The type of the relationship.\")\n",
    "    \n",
    "    class Graph(BaseModel):\n",
    "        \"\"\"Represents a graph document consisting of nodes and relationships.\n",
    "        \"\"\"\n",
    "    \n",
    "        nodes: Optional[List[Node]] = Field(description=\"List of nodes\")\n",
    "        relationships: Optional[List[Relationship]] = Field(description=\"List of relationships\")\n",
    "\n",
    "    return Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d7d14c8a-04fe-4c05-a857-705cfb7144fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/langchain_core/_api/beta_decorator.py:86: LangChainBetaWarning: The function `with_structured_output` is in beta. It is actively being worked on, so the API may change.\n",
      "  warn_beta(\n"
     ]
    }
   ],
   "source": [
    "schema = create_simple_model([\"Business\", \"Individual\"], [\"SUES\"])\n",
    "\n",
    "structured_llm = llm.with_structured_output(schema)\n",
    "chain = prompt | structured_llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "eb1c5275-a491-4ff2-9d96-bc7c4f6aa049",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Graph(nodes=[Node(id='Elon Musk', type='Individual'), Node(id='OpenAI', type='Business')], relationships=[Relationship(source=Node(id='Elon Musk', type='Individual'), target=Node(id='OpenAI', type='Business'), type='SUES')])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain.invoke({\"input\": \"Elon Musk is suing OpenAI\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0ec0a91e-9037-452e-a186-8d26152c6ca2",
   "metadata": {},
   "outputs": [],
   "source": [
    "schema = create_simple_model()\n",
    "\n",
    "structured_llm = llm.with_structured_output(schema)\n",
    "chain = prompt | structured_llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0edc70b2-8a14-4bce-9e12-16e263f13fe3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Graph(nodes=[Node(id='Elon Musk', type='Person'), Node(id='OpenAI', type='Organization')], relationships=[Relationship(source=Node(id='Elon Musk', type='Person'), target=Node(id='OpenAI', type='Organization'), type='is suing')])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain.invoke({\"input\": \"Elon Musk is suing OpenAI\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36334d4f-1f6e-472c-adc0-6d86232b9ea7",
   "metadata": {},
   "source": [
    "# Extended\n",
    "\n",
    "* Each node label is a separate class, so that we can define props per node label\n",
    "* Split into two LLM calls (one for nodes, and one for rels)\n",
    "* not limiting rel types between specific node labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ad0efc45-8336-40b0-bd7f-127ae9af19c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'Movie': [{'property': 'url', 'type': 'STRING'},\n",
       "   {'property': 'runtime', 'type': 'INTEGER'},\n",
       "   {'property': 'revenue', 'type': 'INTEGER'},\n",
       "   {'property': 'plotEmbedding', 'type': 'LIST'},\n",
       "   {'property': 'posterEmbedding', 'type': 'LIST'},\n",
       "   {'property': 'imdbRating', 'type': 'FLOAT'},\n",
       "   {'property': 'released', 'type': 'STRING'},\n",
       "   {'property': 'countries', 'type': 'LIST'},\n",
       "   {'property': 'languages', 'type': 'LIST'},\n",
       "   {'property': 'plot', 'type': 'STRING'},\n",
       "   {'property': 'imdbVotes', 'type': 'INTEGER'},\n",
       "   {'property': 'imdbId', 'type': 'STRING'},\n",
       "   {'property': 'year', 'type': 'INTEGER'},\n",
       "   {'property': 'poster', 'type': 'STRING'},\n",
       "   {'property': 'movieId', 'type': 'STRING'},\n",
       "   {'property': 'tmdbId', 'type': 'STRING'},\n",
       "   {'property': 'title', 'type': 'STRING'},\n",
       "   {'property': 'budget', 'type': 'INTEGER'}]},\n",
       " {'Genre': [{'property': 'name', 'type': 'STRING'}]},\n",
       " {'User': [{'property': 'userId', 'type': 'STRING'},\n",
       "   {'property': 'name', 'type': 'STRING'}]},\n",
       " {'Person': [{'property': 'url', 'type': 'STRING'},\n",
       "   {'property': 'bornIn', 'type': 'STRING'},\n",
       "   {'property': 'bio', 'type': 'STRING'},\n",
       "   {'property': 'died', 'type': 'DATE'},\n",
       "   {'property': 'born', 'type': 'DATE'},\n",
       "   {'property': 'imdbId', 'type': 'STRING'},\n",
       "   {'property': 'name', 'type': 'STRING'},\n",
       "   {'property': 'poster', 'type': 'STRING'},\n",
       "   {'property': 'tmdbId', 'type': 'STRING'}]}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "node_schema = [{k:graph.structured_schema[\"node_props\"][k]} for k in graph.structured_schema[\"node_props\"] if k not in [\"Actor\", \"Director\", \"_Bloom_Perspective_\", \"_Bloom_Scene_\"]]\n",
    "node_schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a509e3c3-445c-4c43-94ff-f90e815e284b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any, Dict, List, Optional, Type\n",
    "from langchain_core.pydantic_v1 import BaseModel, create_model\n",
    "\n",
    "\n",
    "def json_to_pydantic(json_data: List[Dict[str, List[Dict[str, str]]]]) -> Type[BaseModel]:\n",
    "    type_mapping = {\n",
    "        'STRING': (Optional[str], None),\n",
    "        'INTEGER': (Optional[int], None),\n",
    "        'FLOAT': (Optional[float], None),\n",
    "        'LIST': (Optional[List[str]], None),\n",
    "        'DATE': (Optional[str], None)\n",
    "    }\n",
    "\n",
    "    models = {}\n",
    "\n",
    "    # Generate individual Pydantic models for each class\n",
    "    for entity in json_data:\n",
    "        for class_name, properties in entity.items():\n",
    "            fields = {'id': (Optional[str], None)}\n",
    "            fields.update({prop['property']: type_mapping[prop['type']] for prop in properties})\n",
    "            model = create_model(class_name, **fields)\n",
    "            models[class_name] = model\n",
    "\n",
    "    # Generate a Nodes class containing lists of the above models as attributes\n",
    "    nodes_fields = {model_name: (Optional[List[model]], None) for model_name, model in models.items()}\n",
    "    Nodes = create_model('Nodes', **nodes_fields)\n",
    "\n",
    "    return Nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "65d15f39-6bc4-4faa-8e6f-e82aea3db0b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_schema_pydantic = json_to_pydantic(node_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4e096157-caf6-444d-8b20-52dcca0f8f86",
   "metadata": {},
   "outputs": [],
   "source": [
    "structured_llm_nodes = llm.with_structured_output(node_schema_pydantic)\n",
    "nodes_chain = prompt | structured_llm_nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e8a2a55a-9b44-437e-ad7f-372cccbb3dfc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Nodes(Movie=[Movie(id='Dune', url=None, runtime=None, revenue=None, plotEmbedding=None, posterEmbedding=None, imdbRating=None, released=None, countries=['United States'], languages=['English'], plot='Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis.', imdbVotes=None, imdbId=None, year=2021, poster=None, movieId=None, tmdbId=None, title=None, budget=None)], Genre=None, User=None, Person=[Person(id='Denis Villeneuve', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Denis Villeneuve', poster=None, tmdbId=None), Person(id='Jon Spaihts', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jon Spaihts', poster=None, tmdbId=None), Person(id='Eric Roth', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Eric Roth', poster=None, tmdbId=None), Person(id='Frank Herbert', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Frank Herbert', poster=None, tmdbId=None), Person(id='Timothée Chalamet', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Timothée Chalamet', poster=None, tmdbId=None), Person(id='Rebecca Ferguson', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Rebecca Ferguson', poster=None, tmdbId=None), Person(id='Oscar Isaac', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Oscar Isaac', poster=None, tmdbId=None), Person(id='Josh Brolin', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Josh Brolin', poster=None, tmdbId=None), Person(id='Stellan Skarsgård', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Stellan Skarsgård', poster=None, tmdbId=None), Person(id='Dave Bautista', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Dave Bautista', poster=None, tmdbId=None), Person(id='Stephen McKinley Henderson', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Stephen McKinley Henderson', poster=None, tmdbId=None), Person(id='Zendaya', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Zendaya', poster=None, tmdbId=None), Person(id='Chang Chen', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Chang Chen', poster=None, tmdbId=None), Person(id='Sharon Duncan-Brewster', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Sharon Duncan-Brewster', poster=None, tmdbId=None), Person(id='Charlotte Rampling', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Charlotte Rampling', poster=None, tmdbId=None), Person(id='Jason Momoa', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jason Momoa', poster=None, tmdbId=None), Person(id='Javier Bardem', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Javier Bardem', poster=None, tmdbId=None)])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = \"\"\"\n",
    "Dune (titled onscreen as Dune: Part One) is a 2021 American epic science fiction film directed and co-produced by Denis Villeneuve, who co-wrote the screenplay with Jon Spaihts and Eric Roth. It is the first of a two-part adaptation of the 1965 novel of the same name by Frank Herbert. Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis. The ensemble cast includes Timothée Chalamet, Rebecca Ferguson, Oscar Isaac, Josh Brolin, Stellan Skarsgård, Dave Bautista, Stephen McKinley Henderson, Zendaya, Chang Chen, Sharon Duncan-Brewster, Charlotte Rampling, Jason Momoa, and Javier Bardem.\n",
    "\"\"\"\n",
    "\n",
    "nodes_chain.invoke({\"input\":text})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ad880f1-37d3-4d44-bb34-9a49b5d0a602",
   "metadata": {},
   "source": [
    "### Extract rels as step 2 (simple)\n",
    "\n",
    "* Here we hope that id's will be identical to the step 1 (no guarantees)\n",
    "* No rel props"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f3dc7aec-c8a4-429d-ab3f-9454ccf23c87",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_relsonly_model(node_labels: Optional[List[str]] = None, rel_types:Optional[List[str]] = None) -> BaseModel:\n",
    "    \"\"\"\n",
    "    Simple model allows to limit node and/or relationship types.\n",
    "    Doesn't have any node or relationship properties.\n",
    "    \"\"\"\n",
    "    class Node(BaseModel):\n",
    "        \"\"\"Represents a node in a graph with associated properties.\n",
    "        \"\"\"\n",
    "        id: str = Field(description=\"A unique identifier for the node.\")\n",
    "        type: str = optional_enum_field(node_labels, description=\"The type or label of the node.\")\n",
    "    \n",
    "    class Relationship(BaseModel):\n",
    "        \"\"\"Represents a directed relationship between two nodes in a graph.\n",
    "        \"\"\"\n",
    "    \n",
    "        source: Node = Field(description=\"The source node of the relationship.\")\n",
    "        target: Node = Field(description=\"The target node of the relationship.\")\n",
    "        type: str = optional_enum_field(rel_types, description=\"The type of the relationship.\")\n",
    "    \n",
    "    class Graph(BaseModel):\n",
    "        \"\"\"Represents a graph document consisting of nodes and relationships.\n",
    "        \"\"\"\n",
    "        relationships: Optional[List[Relationship]] = Field(description=\"List of relationships\")\n",
    "\n",
    "    return Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "611ad10e-2c33-4726-98a0-2107495d7c0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_labels = node_schema = [k for k in graph.structured_schema[\"node_props\"] if k not in [\"Actor\", \"Director\", \"_Bloom_Perspective_\", \"_Bloom_Scene_\"]]\n",
    "rel_types = [k[\"type\"] for k in graph.structured_schema[\"relationships\"] if k not in [\"_Bloom_HAS_SCENE_\"]]\n",
    "rel_schema_pydantic = extract_relsonly_model(node_labels, rel_types)\n",
    "\n",
    "structured_llm_rels = llm.with_structured_output(rel_schema_pydantic)\n",
    "rel_chain = prompt | structured_llm_rels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cd88dcf0-33be-4132-a6c4-32310d7443ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Graph(relationships=[Relationship(source=Node(id='Dune', type='Movie'), target=Node(id='Science Fiction', type='Genre'), type='IN_GENRE'), Relationship(source=Node(id='Denis Villeneuve', type='Person'), target=Node(id='Dune', type='Movie'), type='DIRECTED'), Relationship(source=Node(id='Denis Villeneuve', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Jon Spaihts', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Eric Roth', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Frank Herbert', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Timothée Chalamet', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Rebecca Ferguson', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Oscar Isaac', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Josh Brolin', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Stellan Skarsgård', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Dave Bautista', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Stephen McKinley Henderson', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Zendaya', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Chang Chen', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Sharon Duncan-Brewster', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Charlotte Rampling', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Jason Momoa', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN'), Relationship(source=Node(id='Javier Bardem', type='Person'), target=Node(id='Dune', type='Movie'), type='ACTED_IN')])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rel_chain.invoke({\"input\": text})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ba08e7c-284e-4b0d-be70-019c15269c52",
   "metadata": {},
   "source": [
    "### Extract rels as step 2 (extended)\n",
    "\n",
    "* Here we hope that id's will be identical to the step 1 (no guarantees)\n",
    "* Each rel is a separate class, so we can add props and also between which node labels it appears\n",
    "* Duplicates for multilabeled nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c2e35013-237a-4bfb-b9f2-c3b32a7279c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_relationships_class(input_data: Dict[str, Any]) -> Type[BaseModel]:\n",
    "    rel_props = input_data['rel_props']\n",
    "    relationships = input_data['relationships']\n",
    "\n",
    "    # Create node classes with a required id property\n",
    "    node_classes = {f\"{node_name}Node\": create_model(f\"{node_name}Node\", id=(str, ...)) \n",
    "                    for rel in relationships for node_name in [rel['start'], rel['end']]}\n",
    "\n",
    "    # Initialize relationship models referencing node classes\n",
    "    relationship_models = {}\n",
    "    for rel in relationships:\n",
    "        rel_type=rel[\"type\"]\n",
    "        # Initialize fields with optional source and target references\n",
    "        fields = {'source': (Any, ...), 'target': (Any, ...)}\n",
    "        \n",
    "        # Add optional properties for this relationship type if they exist\n",
    "        additional_props = rel_props.get(rel_type, [])\n",
    "        for prop in additional_props:\n",
    "            prop_type = prop['type']\n",
    "            # Make all additional properties optional\n",
    "            fields[prop['property']] = (Optional[str], None) if prop_type == 'STRING' else \\\n",
    "                                       (Optional[int], None) if prop_type == 'INTEGER' else \\\n",
    "                                       (Optional[float], None) if prop_type == 'FLOAT' else \\\n",
    "                                       (Optional[List[Any]], None)\n",
    "\n",
    "        # Define source and target types based on the first occurrence in relationships\n",
    "        for relationship in relationships:\n",
    "            if relationship['type'] == rel_type:\n",
    "                source_class = node_classes[f\"{relationship['start']}Node\"]\n",
    "                target_class = node_classes[f\"{relationship['end']}Node\"]\n",
    "                fields['source'] = (source_class, ...)\n",
    "                fields['target'] = (target_class, ...)\n",
    "                break  # Assuming one example is enough to define the model\n",
    "        # In order to handle relationship with the same type between different node labels, we need to add source and target info to class name\n",
    "        model_name = rel_type + rel[\"start\"] + rel[\"end\"]\n",
    "        relationship_models[model_name] = create_model(model_name, **fields)\n",
    "\n",
    "    # Define fields for the Relationship class containing lists of each relationship model\n",
    "    relationship_fields = {rel_type: (List[model], []) for rel_type, model in relationship_models.items()}\n",
    "    Relationship = create_model('Relationship', **relationship_fields)\n",
    "\n",
    "    return Relationship"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "aacff063-02ba-4953-85e1-1d58e3886532",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pydantic/v1/main.py:996: RuntimeWarning: fields may not start with an underscore, ignoring \"_Bloom_HAS_SCENE__Bloom_Perspective__Bloom_Scene_\"\n",
      "  warnings.warn(f'fields may not start with an underscore, ignoring \"{f_name}\"', RuntimeWarning)\n"
     ]
    }
   ],
   "source": [
    "rel_extended_schema = create_relationships_class(graph.structured_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "4fb56b1d-c642-4820-8325-92029f35dddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "structured_llm_rels_extended = llm.with_structured_output(rel_extended_schema)\n",
    "rel_chain_extended = prompt | structured_llm_rels_extended"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "81cc3adf-0cb8-4d61-b8f8-cc54022aa272",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Relationship(IN_GENREMovieGenre=[IN_GENREMovieGenre(source=MovieNode(id='Dune'), target=GenreNode(id='scienceFiction'))], RATEDUserMovie=[], ACTED_INActorMovie=[], DIRECTEDActorMovie=[], DIRECTEDDirectorMovie=[], ACTED_INDirectorMovie=[], ACTED_INPersonMovie=[ACTED_INPersonMovie(source=ActorNode(id='Timothée Chalamet'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Rebecca Ferguson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Oscar Isaac'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Josh Brolin'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stellan Skarsgård'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Dave Bautista'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stephen McKinley Henderson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Zendaya'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Chang Chen'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Sharon Duncan-Brewster'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Charlotte Rampling'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Jason Momoa'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Javier Bardem'), target=MovieNode(id='Dune'), role=None)], DIRECTEDPersonMovie=[DIRECTEDPersonMovie(source=ActorNode(id='Denis Villeneuve'), target=MovieNode(id='Dune'), role=None)])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rel_chain_extended.invoke({\"input\": text})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "e3be49bb-8d25-49b2-b190-83c5b6018bfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combining should solve entity disambiguation between nodes and rels (work only with the latest gpt-4)\n",
    "class Graph(BaseModel):\n",
    "    \"\"\"Represents a graph document consisting of nodes and relationships.\n",
    "    \"\"\"\n",
    "    nodes: Optional[node_schema_pydantic]\n",
    "    relationships: Optional[rel_extended_schema]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "dcd0346a-9052-441d-8ca3-d4980b841cba",
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_extended_llm = llm.with_structured_output(Graph)\n",
    "chain_extended_combined = prompt | combined_extended_llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "d3ba2497-ef24-44b4-98b5-71a52850cc4d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Graph(nodes=Nodes(Movie=[Movie(id='Dune', url=None, runtime=None, revenue=None, plotEmbedding=None, posterEmbedding=None, imdbRating=None, released=None, countries=['United States'], languages=['English'], plot='Set in the distant future, the film follows Paul Atreides as his family, the noble House Atreides, is thrust into a war for the deadly and inhospitable desert planet Arrakis.', imdbVotes=None, imdbId=None, year=2021, poster=None, movieId=None, tmdbId=None, title=None, budget=None)], Genre=None, User=None, Person=[Person(id='Denis Villeneuve', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Denis Villeneuve', poster=None, tmdbId=None), Person(id='Jon Spaihts', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jon Spaihts', poster=None, tmdbId=None), Person(id='Eric Roth', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Eric Roth', poster=None, tmdbId=None), Person(id='Frank Herbert', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Frank Herbert', poster=None, tmdbId=None), Person(id='Timothée Chalamet', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Timothée Chalamet', poster=None, tmdbId=None), Person(id='Rebecca Ferguson', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Rebecca Ferguson', poster=None, tmdbId=None), Person(id='Oscar Isaac', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Oscar Isaac', poster=None, tmdbId=None), Person(id='Josh Brolin', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Josh Brolin', poster=None, tmdbId=None), Person(id='Stellan Skarsgård', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Stellan Skarsgård', poster=None, tmdbId=None), Person(id='Dave Bautista', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Dave Bautista', poster=None, tmdbId=None), Person(id='Stephen McKinley Henderson', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Stephen McKinley Henderson', poster=None, tmdbId=None), Person(id='Zendaya', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Zendaya', poster=None, tmdbId=None), Person(id='Chang Chen', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Chang Chen', poster=None, tmdbId=None), Person(id='Sharon Duncan-Brewster', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Sharon Duncan-Brewster', poster=None, tmdbId=None), Person(id='Charlotte Rampling', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Charlotte Rampling', poster=None, tmdbId=None), Person(id='Jason Momoa', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Jason Momoa', poster=None, tmdbId=None), Person(id='Javier Bardem', url=None, bornIn=None, bio=None, died=None, born=None, imdbId=None, name='Javier Bardem', poster=None, tmdbId=None)]), relationships=Relationship(IN_GENREMovieGenre=[], RATEDUserMovie=[], ACTED_INActorMovie=[], DIRECTEDActorMovie=[], DIRECTEDDirectorMovie=[], ACTED_INDirectorMovie=[], ACTED_INPersonMovie=[ACTED_INPersonMovie(source=ActorNode(id='Timothée Chalamet'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Rebecca Ferguson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Oscar Isaac'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Josh Brolin'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stellan Skarsgård'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Dave Bautista'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Stephen McKinley Henderson'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Zendaya'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Chang Chen'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Sharon Duncan-Brewster'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Charlotte Rampling'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Jason Momoa'), target=MovieNode(id='Dune'), role=None), ACTED_INPersonMovie(source=ActorNode(id='Javier Bardem'), target=MovieNode(id='Dune'), role=None)], DIRECTEDPersonMovie=[DIRECTEDPersonMovie(source=ActorNode(id='Denis Villeneuve'), target=MovieNode(id='Dune'), role=None)]))"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain_extended_combined.invoke({\"input\": text})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3598febb-d546-4b71-9f62-0a0f72702c25",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
